import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook"
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff
Importing data¶
single_family_data_raw = pd.read_excel('data/Customer Watch Query billed FY19_SFR_Usage RATE OPTIONS.xlsx')
single_family_data_raw.head()
| Unnamed: 0 | FY19 | BILLING REPORT FOR SINGLE-FAMILY RESIDENTIAL | Unnamed: 3 | Unnamed: 4 | Unnamed: 5 | Unnamed: 6 | Unnamed: 7 | Unnamed: 8 | Unnamed: 9 | Unnamed: 10 | Unnamed: 11 | Unnamed: 12 | Unnamed: 13 | Unnamed: 14 | Unnamed: 15 | Unnamed: 16 | Unnamed: 17 | Unnamed: 18 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | NaN | NaN | NaN | NaN | NaN | NaN | FY23 Tier 1 Rate | FY23 Tier 2 Rate | FY23 Tier 3 Rate | FY23 Monthly | FY23 TOTAL ANNUAL CURRENT BILL | Uniform Rate Option | FY23 TOTAL ANNUAL UNIFORM BILL | NaN | NaN | 100% Volume Option | NaN | NaN | NaN |
| 1 | Premise Token Number | # Days Billed Fiscal Year | Annual Billed Use (CCF) in Tier 1 | Annual Billed Use (CCF) in Tier 2 | Annual Billed Use (CCF) in Tier 3 | TOTAL ANNUAL USE CCF | 4.6 | 6.32 | 8.35 | 30.14 | MONTHLY EQUIV | 5.569654 | MONTHLY EQUIV | Monthly Savings/ Unsavings | Savings/ Unsavings % | 9.255462 | MONTHLY EQUIV | Monthly Savings/ Unsavings | Savings/ Unsavings % |
| 2 | 3856900 | 364 | 88 | 106 | 7136 | 7330 | NaN | NaN | NaN | NaN | 5085.166667 | NaN | 3432.270209 | 1652.896457 | 0.325043 | NaN | 5653.544855 | -568.378188 | -0.111772 |
| 3 | 5788640 | 366 | 88 | 108 | 3284 | 3480 | NaN | NaN | NaN | NaN | 2405.87 | NaN | 1645.339608 | 760.530392 | 0.316114 | NaN | 2684.084051 | -278.214051 | -0.11564 |
| 4 | 4577760 | 347 | 84 | 88 | 2711 | 2883 | NaN | NaN | NaN | NaN | 1995.090833 | NaN | 1368.249331 | 626.841503 | 0.314192 | NaN | 2223.624804 | -228.533971 | -0.114548 |
Data cleaning¶
Some data wrangling is necessary to turn the Excel data into a useable format for Python.
single_family_data = single_family_data_raw.copy()
# Hold on to constants defined at top of file
FY23_tier_1_rate = 4.6
FY23_tier_2_rate = 6.32
FY23_tier_3_rate = 8.35
FY23_monthly = 30.14
uniform_rate = 5.569654
volume_rate = 9.255462
# Set all rate columns to actual values
single_family_data.iloc[2:,6] = FY23_tier_1_rate
single_family_data.iloc[2:,7] = FY23_tier_2_rate
single_family_data.iloc[2:,8] = FY23_tier_3_rate
single_family_data.iloc[2:,9] = FY23_monthly
single_family_data.iloc[2:,11] = uniform_rate
single_family_data.iloc[2:,15] = volume_rate
# Moving rate column names down a row with the other column names
single_family_data.iloc[1, [6,7,8,9]] = single_family_data.iloc[0, [6,7,8,9]]
single_family_data.iloc[1, [11]] = single_family_data.iloc[0, [11]]
single_family_data.iloc[1, [15]] = single_family_data.iloc[0, [15]]
# Set the DataFrame column names to be the column name row
single_family_data.columns = single_family_data.iloc[1]
# Drop the column name row that
single_family_data = single_family_data.drop([0, 1], axis=0)
# Reset the index
single_family_data = single_family_data.reset_index(drop=True)
# Remove excess rows at the bottom with some summary data
single_family_data = single_family_data.drop(np.arange(330950, 330955), axis=0)
# Fix savings/ unsavings % column to be a percentage rather than a proportion
single_family_data['Savings/ Unsavings %'] = single_family_data['Savings/ Unsavings %']*100
# Set most columns to be of type float
single_family_data = single_family_data.astype('float')
single_family_data['Premise Token Number'] = single_family_data['Premise Token Number'].astype('int')
single_family_data.head()
| 1 | Premise Token Number | # Days Billed Fiscal Year | Annual Billed Use (CCF) in Tier 1 | Annual Billed Use (CCF) in Tier 2 | Annual Billed Use (CCF) in Tier 3 | TOTAL ANNUAL USE CCF | FY23 Tier 1 Rate | FY23 Tier 2 Rate | FY23 Tier 3 Rate | FY23 Monthly | MONTHLY EQUIV | Uniform Rate Option | MONTHLY EQUIV | Monthly Savings/ Unsavings | Savings/ Unsavings % | 100% Volume Option | MONTHLY EQUIV | Monthly Savings/ Unsavings | Savings/ Unsavings % |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3856900 | 364.0 | 88.0 | 106.0 | 7136.0 | 7330.0 | 4.6 | 6.32 | 8.35 | 30.14 | 5085.166667 | 5.569654 | 3432.270209 | 1652.896457 | 32.504273 | 9.255462 | 5653.544855 | -568.378188 | -11.177179 |
| 1 | 5788640 | 366.0 | 88.0 | 108.0 | 3284.0 | 3480.0 | 4.6 | 6.32 | 8.35 | 30.14 | 2405.870000 | 5.569654 | 1645.339608 | 760.530392 | 31.611450 | 9.255462 | 2684.084051 | -278.214051 | -11.563969 |
| 2 | 4577760 | 347.0 | 84.0 | 88.0 | 2711.0 | 2883.0 | 4.6 | 6.32 | 8.35 | 30.14 | 1995.090833 | 5.569654 | 1368.249331 | 626.841503 | 31.419196 | 9.255462 | 2223.624804 | -228.533971 | -11.454815 |
| 3 | 8527140 | 368.0 | 88.0 | 90.0 | 2601.0 | 2779.0 | 4.6 | 6.32 | 8.35 | 30.14 | 1921.135833 | 5.569654 | 1319.978998 | 601.156836 | 31.291740 | 9.255462 | 2143.410798 | -222.274965 | -11.569977 |
| 4 | 4107220 | 364.0 | 87.0 | 83.0 | 2549.0 | 2719.0 | 4.6 | 6.32 | 8.35 | 30.14 | 1880.882500 | 5.569654 | 1292.130728 | 588.751772 | 31.301890 | 9.255462 | 2097.133487 | -216.250987 | -11.497315 |
# Sanity checks
num_accounts_billed_for_tier_1_use = np.count_nonzero(single_family_data['Annual Billed Use (CCF) in Tier 1'])
num_accounts_billed_for_tier_2_use = np.count_nonzero(single_family_data['Annual Billed Use (CCF) in Tier 2'])
num_accounts_billed_for_tier_3_use = np.count_nonzero(single_family_data['Annual Billed Use (CCF) in Tier 3'])
tier_1_ccf_tot = np.sum(single_family_data['Annual Billed Use (CCF) in Tier 1'])
tier_2_ccf_tot = np.sum(single_family_data['Annual Billed Use (CCF) in Tier 2'])
tier_3_ccf_tot = np.sum(single_family_data['Annual Billed Use (CCF) in Tier 3'])
print('# of Accounts billed for')
print(f'Tier 1 Use in Fiscal Year: {num_accounts_billed_for_tier_1_use}')
print(f'Tier 2 Use in Fiscal Year: {num_accounts_billed_for_tier_2_use}')
print(f'Tier 3 Use in Fiscal Year: {num_accounts_billed_for_tier_3_use}\n')
print('Total Use (CCF) in')
print(f'Tier 1: {tier_1_ccf_tot}')
print(f'Tier 2: {tier_2_ccf_tot}')
print(f'Tier 3: {tier_3_ccf_tot}')
print(f'Combined: {tier_1_ccf_tot + tier_2_ccf_tot + tier_3_ccf_tot}')
# of Accounts billed for Tier 1 Use in Fiscal Year: 329507 Tier 2 Use in Fiscal Year: 182878 Tier 3 Use in Fiscal Year: 71520 Total Use (CCF) in Tier 1: 20377678.0 Tier 2: 6835705.0 Tier 3: 5261988.0 Combined: 32475371.0
Exploration¶
How much water is used in each tier?¶
Looking at these water usage distributions, we can observe that most people use between 80 and 90 CCF in tier 1 and very little in tiers 2 and 3. This aligns with expectations of normal water use being 7 CCF per month for tier 1, which 84 CCF annually.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15,4))
sns.histplot(data=single_family_data, x='Annual Billed Use (CCF) in Tier 1', ax=ax1, binwidth=5, color='blue');
sns.histplot(data=single_family_data, x='Annual Billed Use (CCF) in Tier 2', ax=ax2, binwidth=5, color='orange');
sns.histplot(data=single_family_data, x='Annual Billed Use (CCF) in Tier 3', ax=ax3, color='green');
ax1.set_title('Tier 1')
ax2.set_title('Tier 2')
ax3.set_title('Tier 3')
plt.suptitle('Tier 1/2/3 Annual Billed Water Use (CCF) Single Family Residential Distribution')
plt.show()
How much are single family residents paying for water under the current rate structure?¶
The plot below has a long right tail. There are a couple of outliers samples that are paying significantly more per month compared to most other single family residential customers, causing the visualization to be very stretch out.
fig, ax = plt.subplots(1, 1, figsize=(15,4))
sns.histplot(x=single_family_data['MONTHLY EQUIV'].iloc[:,0], ax=ax, binwidth=5, color='blue');
ax.set_xlabel('Monthly Bill (dollars)')
ax.set_title('Single Family Residential Monthly Bill Distribution Under Tiered Rate Structure');
Identifying outliers¶
I used a 1.5*IQR cutoff to capture outlier samples based on the monthly bill using the current tiered rate structure.
def IQR_flag_outliers(arr):
# return a boolean array with True for outliers and False for inliers
q1 = np.percentile(arr, 25)
q3 = np.percentile(arr, 75)
iqr = q3-q1
return (arr < q1 - 1.5*iqr) | (arr > q3 + 1.5*iqr)
According to this, there were 28177 samples that were identified as being outliers. Of these outlier samples, the max monthly bill was $5085.17. This is very high, so it is good that it was captured as an outlier. The minimum monthly bill classified as an outlier was $134.76. Looking at the below histogram, 134.76 could actually probably be considered an inlier. However, for the purposes of exploration, it works fine as a cutoff to show the interesting part of the distribution. Also, $5085.17 is a massive monthly bill. I would be interested in learning more about that sample.
# Identify outliers
outliers = IQR_flag_outliers(single_family_data['MONTHLY EQUIV'].iloc[:,0])
single_family_data[outliers].describe()[['Premise Token Number', 'Annual Billed Use (CCF) in Tier 1', 'Annual Billed Use (CCF) in Tier 2', 'Annual Billed Use (CCF) in Tier 3', 'MONTHLY EQUIV']]
| 1 | Premise Token Number | Annual Billed Use (CCF) in Tier 1 | Annual Billed Use (CCF) in Tier 2 | Annual Billed Use (CCF) in Tier 3 | MONTHLY EQUIV | MONTHLY EQUIV | MONTHLY EQUIV |
|---|---|---|---|---|---|---|---|
| count | 2.817700e+04 | 28177.000000 | 28177.000000 | 28177.000000 | 28177.000000 | 28177.000000 | 28177.000000 |
| mean | 5.260086e+06 | 83.575363 | 80.925045 | 158.225822 | 214.896548 | 179.929449 | 248.915037 |
| std | 2.229513e+06 | 6.389325 | 16.425303 | 156.818533 | 110.919488 | 74.588991 | 123.949462 |
| min | 2.107000e+03 | 14.000000 | 13.000000 | 12.000000 | 134.759167 | 108.579291 | 130.347760 |
| 25% | 3.351680e+06 | 81.000000 | 70.000000 | 74.000000 | 152.815833 | 138.284112 | 179.710225 |
| 50% | 5.256600e+06 | 87.000000 | 78.000000 | 112.000000 | 180.970000 | 157.313762 | 211.333055 |
| 75% | 7.172566e+06 | 88.000000 | 93.000000 | 187.000000 | 235.719167 | 193.980650 | 272.264848 |
| max | 9.969246e+06 | 129.000000 | 161.000000 | 7136.000000 | 5085.166667 | 3432.270209 | 5653.544855 |
Most single family residential families spend between 40 and 60 dollars monthly on water. After removing the outliers, it is more clear that the distribution is fairly wide. The variations in monthly bill could be caused by the differences in tier 1 and tier 2 water usage shown in Annual Billed Water Use histograms from above.
# Remove outliers, then plot again
outliers_removed = single_family_data[~outliers]
fig, ax = plt.subplots(1, 1, figsize=(15,4))
sns.histplot(x=outliers_removed['MONTHLY EQUIV'].iloc[:,0], ax=ax, binwidth=5, color='blue');
ax.set_xlabel('Monthly Bill (dollars)')
ax.set_title('Single Family Residential Monthly Bill Distribution Under Tiered Rate Structure');
Comparing across rate structures¶
Monthly equivalent bill for each rate structure¶
The rates in each of the rate structures can be changed arbitratily to achieve any revenue goal. In the provided data, the rates for the uniform and volume option were chosen to earn the same revenue as the current tiered structure. Since the amount of revenue gained from each rate structure is the same, we can compare the histograms for monthly bill to see how each rate structure distributes costs across the single residential families.
long_df = pd.DataFrame(
columns=['MONTHLY EQUIV'],
data=np.vstack((
single_family_data['MONTHLY EQUIV'].iloc[:,0:1],
single_family_data['MONTHLY EQUIV'].iloc[:,1:2],
single_family_data['MONTHLY EQUIV'].iloc[:,2:3],
))
)
long_df['Rate Structure'] = np.repeat(['current', 'uniform', 'volume'], single_family_data.shape[0])
fig, ax = plt.subplots(1, 1, figsize=(15,4))
sns.histplot(data=long_df, x='MONTHLY EQUIV', ax=ax, binwidth=5, hue='Rate Structure');
ax.set_xlabel('Monthly Bill (dollars)')
ax.set_title('Single Family Residential Monthly Bill Distribution Under Different Rate Structures');
There is the same issue as before with the outliers. Let's remove those and re-plot.
The volume rate structure distribution is more spread out compared to the other two rate structures which indicates that there is greater variance in what people pay for water under that rate structure. One possible explanation that I can think of for this is that it is more dependent on water use (which has high variance) compared to the other two rate structures which have a monthly service fee.
long_df = pd.DataFrame(
columns=['MONTHLY EQUIV'],
data=np.vstack((
outliers_removed['MONTHLY EQUIV'].iloc[:,0:1],
outliers_removed['MONTHLY EQUIV'].iloc[:,1:2],
outliers_removed['MONTHLY EQUIV'].iloc[:,2:3],
))
)
long_df['Rate Structure'] = np.repeat(['current', 'uniform', 'volume'], outliers_removed.shape[0])
fig, ax = plt.subplots(1, 1, figsize=(15,4))
sns.histplot(data=long_df, x='MONTHLY EQUIV', ax=ax, binwidth=5, hue='Rate Structure');
ax.set_xlabel('Monthly Bill (dollars)')
ax.set_title('Single Family Residential Monthly Bill Distribution Under Different Rate Structures');
# SAME PLOT BUT IN PLOTLY
long_df = pd.DataFrame(
columns=['MONTHLY EQUIV'],
data=np.vstack((
outliers_removed['MONTHLY EQUIV'].iloc[:,0:1],
outliers_removed['MONTHLY EQUIV'].iloc[:,1:2],
outliers_removed['MONTHLY EQUIV'].iloc[:,2:3],
))
)
long_df['Rate Structure'] = np.repeat(['current', 'uniform', 'volume'], outliers_removed.shape[0])
fig = go.Figure()
fig.add_trace(go.Histogram(x=long_df[long_df['Rate Structure'] == 'current']['MONTHLY EQUIV'], xbins={'size':5}, marker_color='blue', opacity=0.8, name='current'))
fig.add_trace(go.Histogram(x=long_df[long_df['Rate Structure'] == 'uniform']['MONTHLY EQUIV'], xbins={'size':5}, marker_color='orange', opacity=0.8, name='uniform'))
fig.add_trace(go.Histogram(x=long_df[long_df['Rate Structure'] == 'volume']['MONTHLY EQUIV'], xbins={'size':5}, marker_color='green', opacity=0.8, name='volume'))
fig.update_layout(
title="Single Family Residential Monthly Bill Distribution Under Different Rate Structures",
xaxis_title="Monthly Bill (dollars)",
yaxis_title="Count",
barmode='overlay',
width=1200,
height=400)
fig.show()